library(tidyverse)
library(lubridate)
library(forcats)
library(skimr)
library(plotly)
library(ggpubr)
library(kableExtra)
library(gridExtra)
library(e1071)
mykable <- function(dt, ...) {
kbl(dt, ...) %>% kable_material(c("striped", "hover", "condensed", "responsive"), full_width = F)
}rawdata <- readr::read_csv("DATA2X02 class survey 2020 (Responses) - Form responses 1.csv")
data <- rawdata %>%
janitor::clean_names() %>%
mutate(
timestamp = lubridate::dmy_hms(timestamp)
) %>%
rename(
covid = how_many_times_have_you_been_tested_for_covid,
postcode = postcode_of_where_you_live_during_semester,
dentist = how_long_has_it_been_since_you_last_went_to_the_dentist,
uni_work = on_average_how_many_hours_per_week_did_you_spend_on_university_work_last_semester,
social_media = what_is_your_favourite_social_media_platform,
dog_cat = did_you_have_a_dog_or_a_cat_when_you_were_a_child,
parents = do_you_currently_live_with_your_parents,
exercise = how_many_hours_a_week_do_you_spend_exercising,
eye_colour = what_is_your_eye_colour,
asthma = do_you_have_asthma,
paid_work = on_average_how_many_hours_per_week_did_you_work_in_paid_employment_in_semester_1,
season = what_is_your_favourite_season_of_the_year,
shoe = what_is_your_shoe_size,
height = how_tall_are_you,
floss = how_often_do_you_floss_your_teeth,
glasses = do_you_wear_glasses_or_contacts,
hand = what_is_your_dominant_hand,
steak = how_do_you_like_your_steak_cooked,
stress = on_a_scale_from_0_to_10_please_indicate_how_stressed_you_have_felt_in_the_past_week
) %>%
mutate(
gender = case_when(
grepl("f", tolower(gender), fixed=T) ~ "Female",
grepl("m", tolower(gender), fixed=T) ~ "Male",
TRUE ~ "Non-binary"
) %>% as.factor(),
dentist = factor(dentist, levels = c("Less than 6 months", "Between 6 and 12 months",
"Between 12 months and 2 years", "More than 2 years", NA),
ordered = T) %>%
recode(
`Less than 6 months` = "< 6 months",
`Between 6 and 12 months` = "6 - 12 months",
`Between 12 months and 2 years` = "12 months - 2 years",
`More than 2 years` = "> 2 years"
),
dog_cat = as_factor(dog_cat),
parents = as_factor(parents),
postcode = as.factor(postcode),
asthma = as_factor(asthma),
season = factor(season, levels = c("Summer", "Autumn", "Winter", "Spring"), ordered = T),
floss = factor(floss, levels = c("Less than once a week", "Weekly", "Most days",
"Every day", NA), ordered = T),
glasses = as_factor(glasses),
hand = case_when(
hand == "Right handed" ~ "Right",
hand == "Left handed" ~ "Left",
TRUE ~ "Ambidextrous"
) %>% as_factor(),
steak = factor(steak, levels = c(
"Rare", "Medium-rare", "Medium", "Medium-well done", "Well done",
"I don't eat beef"
), ordered = T)
)data <- data %>%
mutate(
# Some heights were given in metres instead of centimetres
height = case_when(
height < 2.3 ~ height * 100,
T ~ height
),
# some eye colours are misspelt
eye_colour = tolower(eye_colour),
eye_colour = stringr::str_replace(eye_colour, pattern="dark ",
replacement = ""),
eye_colour = case_when(
eye_colour == "balck" ~ "black",
TRUE ~ eye_colour
) %>% forcats::fct_lump_n(5, other_level="other"),
# fix up social media by taking the first three characters as an identifier
social_media = case_when(
grepl("ess", social_media, fixed=T) ~ "messenger",
T ~ social_media
),
social_media = tolower(social_media) %>%
stringr::str_sub(1, 3),
social_media = case_when(
social_media == "fac" ~ "Facebook",
social_media == "ins" ~ "Instagram",
social_media == "mes" ~ "Messenger",
social_media == "red" ~ "Reddit",
social_media == "tik" ~ "Tiktok",
social_media == "twi" ~ "Twitter",
social_media == "wec" ~ "WeChat",
social_media == "you" ~ "YouTube",
T ~ social_media
) %>%
forcats::fct_lump_n(8, other_level="Other")
) %>%
filter(
exercise < 60 | is.na(exercise),
!(is.na(exercise) & is.na(parents) & is.na(covid) & is.na(dentist))
)Selection bias / Sampling bias: the sample does not accurately represent the population. (e.g. As the class survey is published on Ed, students who spend more time on checking Ed post are more likely to complete the survey.)
Non-response bias: certain groups are under-represented because they elect not to participate
Measurement or designed bias: bias factors in the sampling method influence the data obtained.
randomized controlled double-blind study:
investigator randomly allocate the subject into a treatment group and a control group. The control group is given a placebo but both the subject and investigators don’t know the identity of the groups.
the design is good because we expect the 2 groups to be similar thus any difference in the responses is likely to be caused by the treatment.
controlled vs observational:
A good randomized controlled experiment can establish causation
An observational study can only establish association. It may suggest causation but can’t prove causation.
Confounding occurs when the treatment group and control group differ by some third variable than the treatment) which influences the response that is studied.
Controlling for confounding: make groups more comparable by dividing them into subgroups with respect to the confounders. (e.g. if alcohol consumption is a potential confounding factor, then divide subjects into heavy drinkers, medium drinkers and light drinkers)
Figure 2.1: Visualising the missingness in the data.
data(kyphosis, package = "rpart")
# dplyr::glimpse(kyphosis)
truth = kyphosis$Kyphosis
prediction = ifelse(kyphosis$Start >= 9,
"Predict absent (S+)",
"Predict present(S-)")
tab = table(prediction, truth)
a = tab[1,1]
b = tab[1,2]
c = tab[2,1]
d = tab[2,2]
FN = c/(a+c) # false negative
FP = b/(b+d) # false positive
sen = a/(a+c) # sensitivity / recall
spe = d/(b+d) # specificity
pre = a/(a+b) # positive predictive / precision
NP = d/(c+d) # negative predictive
acc = (a+d)/(a+b+c+d) # accuracyFalse negative rate is 0.12; False positive rate is 0.35; Sensitivity/Recall is 1; Specificity is 0.65; Precision/Positive predictive value is 0.9; Negative predictive value is 0.58; Accuracy is 0.83.
Prospective / cohort study: subjects are initially identified as disease-free and classified by presence or absence of a risk factor.
Retrospective / case control study: take random samples from each of the two outcome categories which are followed back to determine the presence or absence of the risk factor.
Relative risk: The relative risk is the ratio of the probability of having the disease in the group with the risk factor to the probability of having the disease in the group without the risk factor. \(RR=\frac{P(D^+|R^+)}{P(D^+|R^-)}\)
Odds ratio:
data(kyphosis, package = "rpart")
# dplyr::glimpse(kyphosis)
truth = kyphosis$Kyphosis
prediction = ifelse(kyphosis$Start >= 9,
"Predict absent (S+)",
"Predict present(S-)")
tab = table(prediction, truth)
a = tab[1,1]
b = tab[1,2]
c = tab[2,1]
d = tab[2,2]
rr = (a*(c+d))/(c*(a+b))
or = (a*d)/(b*c)
se = sqrt(1/a+1/b+1/c+1/d)
CI = exp(log(or)+c(-1,1)*1.96*se)Expectations of random variable:
For \(T=\sum^n_{i=1}X_i\):
Sampling:
Importance of SE: it tells us the likely size of the estimation error so that we know how accurate or reliable the estimate is.
Two-sided discrepancies of interest:
False alarm rate / Significance level: a given value \(\mu_0\) is rejected incorrectly.
Normal population: use the t-distribution: under the special statistical model where the data are modeled as values taken by iid normal random variables, if the true population mean is indeed \(\mu_0\), then the ratio \(\frac{\bar X-\mu_0}{S/ \sqrt n}\)~\(t_{n-1}\). Thus we choose c such that \(P(|\bar X-\mu_0|>c\frac{S}{\sqrt n})=P(\frac{|\bar X-\mu_0|}{S/\sqrt n}>c)=P(t_{n-1}>c)=\alpha\).
t.test()##
## One Sample t-test
##
## data: data$uni_work
## t = -75.688, df = 137, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 130
## 99 percent confidence interval:
## 24.83204 31.84912
## sample estimates:
## mean of x
## 28.34058
##
## One Sample t-test
##
## data: data$uni_work
## t = -75.688, df = 137, p-value < 2.2e-16
## alternative hypothesis: true mean is less than 130
## 99.5 percent confidence interval:
## -Inf 31.84912
## sample estimates:
## mean of x
## 28.34058
For a test of \(H_0:\mu=\mu_0\) vs \(H_1:\mu>\mu_0\):
Hypothesis: \(H_0:\mu=375\) vs \(H_1:\mu<375\)
Assumptions: \(X_i\) are independently and identically distributed and follow \(N(\mu,\sigma^2)\)
n = length(data$uni_work)
df = n-1
mu = 130
S = sd(data$uni_work)
t_0.05 = qt(0.05,n)
xbar = mu - t_0.05*S/sqrt(n)
pop_mean = mean(data$uni_work)Test statistic: \(T=\frac{\bar X-\mu_0}{S/ \sqrt n}\). Under \(H_0\), test statistic follows a t distribution with $n-1=137 degree of freedom.
Rejection region:
\(\frac{\bar X-\mu}{s/\sqrt n}<t_{n-1}(0.05)\)
\(\bar X<\mu+t_{n-1}(0.05)s/\sqrt n\)
\(\bar X<\mu+-1.66 \times 15.78/ \sqrt 138\)
\(\bar X<132.22\)
Decision:
library(pwr)
# population sd = 0.294, sample size = 6, power: 80% sure of "detecting" that mu is not equal to 375, two-sided, significant level=0.05, how much lower than 375 does mu need to be?
res = pwr.t.test(n = 6, d = NULL, sig.level = 0.05, power = 0.8, type = "one.sample", alternative = "two.sided")
res$d*0.294## [1] 0.4217541
# mu = 374.87, population sd=0.294, power: 80% sure of "detecting" that mu is not equal 75, two-sided, what sample size n would be needed?
res = pwr.t.test(n = NULL, d = (374.87-375)/0.294, sig.level = 0.05, power = 0.8,
type = "one.sample", alternative = "two.sided")
res$n## [1] 42.10456
Hypothesis: \(H_0:p_1=p_{10},p_2=p_{20},...,p_k=p_{k0}\) vs \(H_1:\) at least one equality does not hold.
Assumptions: - independent observations - expected counts are all greater than 5 (i.e. \(e_i=np_{i0}\geq 5\))
y = c(102, 32, 12, 4)
# y = as.data.frame(table(x$covid_test))$Freq
p = c(0.69, 0.21, 0.07,0.03)
y_i = c(y[1:2],sum(y[3:4]))
p_i = c(p[1:2],sum(p[3:4]))
n = sum(y_i)
e_i = n * p_i
e_i >=5## [1] TRUE TRUE TRUE
Test statistic: \(T = \sum_{i=0}^k\frac{(Y_{i} - e_{i})^2}{e_{i}}\).
Observed test statistic: \(t_0 = \sum_{i=0}^k\frac{(y_{i} - e_{i})^2}{e_{i}}\) = 0.1
##
## Chi-squared test for given probabilities
##
## data: y_i
## X-squared = 0.096342, df = 2, p-value = 0.953
P-value:\(P(T\geq t_0)=P(X^2_{`k-1-q`}\geq\) 0.1) = 0.953
Decision:
With the identity of a Poisson distribution that \[X \sim \text{Poisson}(\lambda) \Longrightarrow E[X] = \lambda\]. The estimated lambda is calculated by: \[ \hat{\lambda} = \bar{x} = \sum_{k=1}^n\frac{x_k}n \]
tab <- table(data$covid) %>% as.data.frame() %>% mutate(
Var1 = as.numeric(as.character(Var1)),
pois = dpois(Var1, weighted.mean(Var1, Freq)) * sum(Freq)
)
(ggplot(tab) + geom_bar(aes(x = Var1, y = Freq, fill = "Data"), stat='identity') +
geom_line(aes(x = Var1, y = pois, fill = "Poisson"), colour = "blue") +
theme_linedraw(base_size= 18) +
scale_fill_manual(values = c("Data" = "red", "Poisson" = "blue")) +
labs(fill = "", x = "Number of COVID tests", y = "Count")) %>%
ggplotly()Figure 8.1: Data with overlayed Poisson distribution
table <- table(data$covid, dnn=c("num_tests")) %>% as.data.frame() %>% mutate(
num_tests = as.numeric(as.character(num_tests)),
pois = dpois(num_tests, weighted.mean(num_tests, Freq)) * sum(Freq)
)
mykable(table,
col.names = c("Number of Tests", "Observed Count", "Expected Counts from Poisson"),
digits = 2,
caption = "Data and expected counts")| Number of Tests | Observed Count | Expected Counts from Poisson |
|---|---|---|
| 0 | 101 | 80.72 |
| 1 | 22 | 43.29 |
| 2 | 8 | 11.61 |
| 3 | 2 | 2.07 |
| 4 | 1 | 0.28 |
| 5 | 2 | 0.03 |
| 6 | 1 | 0.00 |
| 10 | 1 | 0.00 |
Hypothesis: \(H_0:p_1=p_{10},p_2=p_{20},...,p_k=p_{k0}\) vs \(H_1:\) at least one equality does not hold.
Assumptions: - independent observations - expected counts are all greater than 5 (i.e. \(e_i=np_{i0}\geq 5\))
tab = as.data.frame(table(data$covid))
y = c(tab$Freq)
x = as.numeric(as.character(tab$Var1))
n = sum(y)
k = length(y)
(lam = sum(y*x)/n)## [1] 0.5362319
## [1] TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
# if some Freq < 5:
ey = c(ey[1:2],sum(ey[3:k]))
y = c(y[1:2],sum(y[3:k]))
k = length(y)
q = 1
df = k-1-qTest statistic: \(T = \sum^k_{i=1}\frac{(Y_i-np_i)^2}{np_i}\), under \(H_0\), degree of freedom is \(k-1-q\) = 1 , where k is the number of groups and q is the number o f parameters that needs to be estimated from the data.
Observed test statistic: With the observed frequencies \(y_i\) from the data and estimated parameter \(\lambda\) = 0.5362, \(t_0\) = 15.63
P-value: \(P(T\geq t_0)=P(\chi^2_{1}\geq\) 15.63) = 10^{-4}
Decision
Since the p-value is greater than 0.05, we do not reject the null hypothesis. The data are consistent with a Poisson distribution.
Since the p-value is smaller than 0.05, we reject the null hypothesis. The data does not follow a Poisson distribution.
Test of homogeneity: Test whether the probability distributions of the categories are the same over the different populations.
Hypothesis: \(H_0:p_{1j}=p_{2j}\) for \(j=1,2,3\) vs \(H_1: p_{11}\neq p_{21}, p_{21}\neq p_{22}\).
n=sum(tab)
r = nrow(tab)
c = ncol(tab)
yr = apply(tab, MARGIN = 1, FUN = sum)
yc = apply(tab, MARGIN = 2, FUN = sum)
etab = yr %*% t(yc) / n
etab >=5## Female Male Non-binary
## [1,] TRUE TRUE FALSE
## [2,] TRUE TRUE FALSE
Assumptions: \(e_{ij}=\frac{y_iy_j}{n} \geq 5\).
Test statistic: \(T=\sum^r_{i=1}\sum^c_{j=1}\frac{Y_{ij}-e_{ij}^2}{e_{ij}}\). Under \(H_0\), the degree of freedom is \((r-1)(c-1)\) = 1, where r is the number of rows and c is the number of columns in contingency table.
##
## Pearson's Chi-squared test
##
## data: tab
## X-squared = 11.635, df = 2, p-value = 0.002975
Observed test statistic: \(\sum^{2}_{i=1}\sum^{3}_{j=1}\frac{y_{ij}-e_{ij}^2}{e_{ij}}\).
P-value: \(P(T \geq 11.63) = P(\chi^2_{2} \geq 11.63) = 0.003\)
Decision:
Hypothesis: \(H_0:p_{ij}=p_ip_j\) for \(i=1,2,...,r,j=1,2,...,c\) vs \(H_1:\) Not all equalities hold.
n=sum(tab)
r = nrow(tab)
c = ncol(tab)
yr = apply(tab, MARGIN = 1, FUN = sum)
yc = apply(tab, MARGIN = 2, FUN = sum)
etab = yr %*% t(yc) / n
etab >=5## Female Male Non-binary
## [1,] TRUE TRUE FALSE
## [2,] TRUE TRUE FALSE
Assumptions: all expected counts are greater or equal to 5 (i.e.\(e_{ij}=\frac{y_iy_j}{n} \geq 5\))
##
## Pearson's Chi-squared test
##
## data: tab
## X-squared = 11.635, df = 2, p-value = 0.002975
Test statistic: \(T=\sum^r_{i=1}\sum^c_{j=1}\frac{(Y_{ij}-e_{ij})^2}{e_{ij}}\). Under \(H_0\), the degree of freedom is \((r-1)(c-1)=\) 2, where c is the number of columns and r is the number of rows in the contingency table.
Observed statistic: \(t_0=\sum^{2}_{i=1}\sum^{3}_{j=1}\frac{(y_{ij}-{y_iy_j/n})^2}{y_iy_j/n}=\) 11.63
P-value: \(P(T\geq 11.63) = P(\chi^2_{2}\geq 11.63) = 0.003\)
Decision:
Fisher’s exact test: Consider all possible permutations of 2 by 2 contingency table whit the same marginal totals. Then calculate how many of these were equal to or “more extreme” than what we observed. As such, the Fisher’s exact test does not require the expected cell counts to be \(\geq 5\).
Drawbacks:
Hypothesis: \(H_0:\) there is no association between … and … vs \(H_1:\) there is an association between … and …
Assumptions: Consider all possible permutations of 2 by 2 contingency table whit the same marginal totals. Then calculate how many of these were equal to or “more extreme” than what we observed. As such, the Fisher’s exact test does not require the expected cell counts to be \(\geq 5\).
tab = table(data$parents,data$gender)
# fisher = fisher.test(tab) # two-sided
(fisher = fisher.test(tab,alternative = "greater"))##
## Fisher's Exact Test for Count Data
##
## data: tab
## p-value = 0.00157
## alternative hypothesis: greater
The degrees of freedom is not calculated as it is not relevant to fisher’s exact test.
Decision:
Yate’s correction: Apply continuity correction with a chi-squared test, using the identity \(P(X\leq x) \approx P(Y\leq x+0.5)\) and \(P(X\geq x) \approx P(Y\geq x-0.5)\).
Hypothesis: \(H_0:\) there is no association between … and … vs \(H_1:\) there is an association between … and …
Assumption: Yate’s correction applies continuity correction to approximate integer-valued variable, therefore, it does not restrict the cell counts.
tab = table(data$parents,data$gender)
r=nrow(tab)
c=ncol(tab)
df = (r-1)*(c-1)
(yate = chisq.test(tab,correct=TRUE))##
## Pearson's Chi-squared test
##
## data: tab
## X-squared = 11.635, df = 2, p-value = 0.002975
Test statistic: \(T=\sum^r_{i=1}\sum^c_{j=1}\frac{(|Y_{ij}-e_{ij}|-0.5)^2}{e_{ij}}\), which approximately follows a \(\chi^2_{(r-1)(c-1)}\) distribution under \(H_0\). The degree of freedom is \((r-1)(c-1)=\) 2, where r is the number of rows and c is the number of columns in the contingency table.
Observed test statistic: \(t_0=\sum^{2}_{i=1}\sum^{3}_{j=1}\frac{(|y_{ij}-e_{ij}|-0.5)^2}{e_{ij}}\) = 11.63.
P-value: \(P(T\geq 11.63) = P(\chi^2_{2}\geq 11.63) = 0.003\)
Decision:
Monte Carlo simulation: Resample (i.e. randomly generate contingency tables) and perform chi-squared tests by many times. Calculate the test statistic for each of the resamples and create a sampling distribution of test statistics. P-value is calculated by determining the proportion of the resampled test statistics \(\geq\) the observed test statistic.
Hypothesis: \(H_0:\) there is no association between … and … vs \(H_1:\) there is an association between … and …
Assumptions: No assumptions are made about the underlying distribution of the population. The cell counts are also not restricted for Monte Carlo simulation.
To calculate the p-value, a Monte Carlo simulation is perform, with 10000 simulations of chi-squared test. Note that degree of freedom is not considered as it is not relevant to the Monte Carlo simulation.
##
## Pearson's Chi-squared test with simulated p-value (based on 10000
## replicates)
##
## data: tab
## X-squared = 11.635, df = NA, p-value = 0.0015
Test statistics: the test statistic is calculated for each of the resamples by \(T=\sum^r_{i=1}\sum^c_{j=1}\frac{(Y_{ij}-e_{ij})^2}{e_{ij}}\).
Observed test statistic: \(t_0=\sum^r_{i=1}\sum^c_{j=1}\frac{(y_{ij}-e_{ij})^2}{e_{ij}}=\) 11.63
P-value: \(P(T\geq\) 11.63) = \(P(\chi^2\geq\) 11.63) = 0.0015
Decision:
mu = 30
p1 = ggplot(data, aes(x= uni_work)) +
geom_histogram(bins = 12, show.legend = F) +
theme_linedraw(base_size = 18) +
labs(y = "Count", x = "Uni Work (hours)") +
geom_boxplot(aes(x = uni_work, y = 60), outlier.alpha = 0.5, width = 18) +
geom_vline(xintercept= mu,colour = "blue",linetype = "dashed") # the tested mean
p2 = ggqqplot(data, x = "uni_work") +
theme_linedraw(base_size = 18) + theme(legend.position = "none")
grid.arrange(p1, p2, ncol=2)Figure 12.1: Distribution of data with blue line indicating the tested mean of …
The data doesn’t follow a normal distribution if:
From Figure 16.4, …
Hypothesis: \(H_0: \mu =30\) vs \(H_1: \mu ><\neq 30\)
Assumptions:
n = length(data$uni_work)
df = n-1
mean = mean(data$uni_work)
mu = 30
S = sd(data$uni_work)
(test = t.test(data$uni_work,mu = mu,alternative = "two.sided"))##
## One Sample t-test
##
## data: data$uni_work
## t = -1.2355, df = 137, p-value = 0.2188
## alternative hypothesis: true mean is not equal to 30
## 95 percent confidence interval:
## 25.68461 30.99655
## sample estimates:
## mean of x
## 28.34058
t0 = test$statistic
pval = test$p.value
# for greater or less than 30:
# t0 = t.test(x,mu = mu,alternative = "greater")$statistic
# pval = t.test(x,mu = mu,alternative = "greater")$p.value
#
# t0 = t.test(x,mu = mu,alternative = "less")$statistic
# pval = t.test(x,mu = mu,alternative = "less")$p.valueTest statistic: \(T=\frac{\bar{X}-\mu_0}{s/\sqrt{n}}\). Under \(H_0\), the data tends to follow t distribution with \(n-1=\) 137 degree of freedom.
Observed test statistic: \(t_0=\frac{28.34-30}{15.78/\sqrt{138}}=-1.24\)
P-value: \(P(t_{137}\leq -1.24)=0.2188\)
Decision:
Two-sample t-test: test whether the population mean of two samples are different.
Welch two-sample t-test: does not assume equal population variances.
sum = data %>% group_by(gender) %>%
summarise(Mean = mean(exercise),
Median = median(exercise),
SD = sd(exercise),
Variance = var(exercise),
n = n())
colnames(sum)=c("Gender","Mean","Median","SD","Variance","Counts")
knitr::kable(sum,digits = 1, align="cccccc",caption = "Statistics of number of hours spent on exercising by different genders.") %>% kable_paper(full_width = F, html_font = "Cambria")| Gender | Mean | Median | SD | Variance | Counts |
|---|---|---|---|---|---|
| Female | 3.4 | 3 | 2.9 | 8.4 | 46 |
| Male | 5.0 | 5 | 3.4 | 11.7 | 91 |
| Non-binary | 5.0 | 5 | NA | NA | 1 |
ggplot(data %>% filter(gender != "Non-binary"), aes(x= exercise)) +
geom_histogram(aes(fill = gender), bins = 12, show.legend = F) +
facet_grid(rows = vars(gender)) +
theme_linedraw(base_size = 18) +
labs(y = "Count", x = "Exercise per week (hours)") +
geom_boxplot(aes(x = exercise, y = 45), outlier.alpha = 0, width = 18) +
geom_jitter(aes(x = exercise, y = 45, colour=gender, alpha = 0.5), height = 8, show.legend=F)Figure 12.2: Distribution of …
ggqqplot(data %>% filter(gender != "Non-binary"), x = "exercise", color = "gender") +
facet_grid(rows = vars(gender)) +
theme_linedraw(base_size = 18) + theme(legend.position = "none")Figure 12.3: Q-Q plots of data in different groups.
The data doesn’t follow a normal distribution if:
Hypothesis: \(H_0:\mu_x=\mu_y\) vs \(H_1:\mu_x >\mu_y\) or \(\mu_x \leq \mu_y\) or \(\mu_x \neq \mu_y\)
Assumptions: - Variables \(X,Y\) are identically and independently distributed and follow \(N(\mu_X,\sigma^2)\) and \(N(\mu_Y,\sigma^2)\). - Observations are independent to each other - Regular two-sample t-test assumes equal population variances of variables while Welch two-sample t-test does not assume equal variance.
x = data %>% filter(gender =="Female")
x= data$exercise
y = data %>% filter(gender =="Male")
y = data$exercise
nx = length(x)
ny = length(y)
sx = sd(x)
sy = sd(y)
sp2 = ((nx-1)*(sx^2)+(ny-1)*(sy^2))/(nx+ny-2)
sp = sqrt(sp2)
xbar = mean(x)
ybar = mean(y)
(test = t.test(x,y, alternative="two.sided"))##
## Welch Two Sample t-test
##
## data: x and y
## t = 0, df = 274, p-value = 1
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.7864101 0.7864101
## sample estimates:
## mean of x mean of y
## 4.478261 4.478261
Test statistic: \(T=\frac{\bar{X}-\bar{Y}}{S_p \sqrt{\frac{1}{n_1}+\frac{1}{n_2}}}\), where \(S^2_p=\frac{(n_x-1)S^2_x+(n_y-1)S^2_y}{n_x+n_y-2}\). Under \(H_0\), the data tend to follow a t distribution with 274 degree of freedom.
Observed test statistic: \(t_0=\frac{4.48 - 4.48}{3.32 \sqrt{\frac{1}{138}+\frac{1}{138}}}\), where \(S^2_p=\frac{(138 -1)3.32^2+(138-1)3.32^2}{138+138 -2}=0\)
P-value:
Decision:
Paired samples t-test: measure twice with the same population. (e.g. Blood samples from individuals before and after they smoked a cigarette). The differences between two variables are usually calculated to perform one-sample t-test.
before = c(25, 25, 27, 44, 30, 67, 53, 53, 52)
after = c(27, 29, 37, 36, 46, 82, 57, 80, 61)
df = data.frame(before,after, diff = after-before)
p1 = ggplot(df, aes(x= diff)) +
geom_histogram(bins = 10, show.legend = F) +
theme_linedraw(base_size = 18) +
labs(y = "Count", x = "Difference") +
geom_boxplot(aes(x = diff, y = 20), outlier.alpha = 0.5, width = 18)
p2 = ggqqplot(df, x = "diff") +
theme_linedraw(base_size = 18) + theme(legend.position = "none")
grid.arrange(p1, p2, ncol=2)The data doesn’t follow a normal distribution if: - A lower bend of zero exist? - Not close enough to the line to be considered ‘normal’.
Hypothesis:\(H_0:\mu_d=0\) vs \(H_1:\mu_d\neq 0\)
Assumptions: differences between two samples are independent and identically distributed, with the identity \(N(\mu_d,\sigma^2)\).
n = length(df$diff)
sd = sd(df$diff)
dbar = mean(df$diff)
t.test(after,before, data = df,paired = TRUE)##
## Paired t-test
##
## data: after and before
## t = 2.6374, df = 8, p-value = 0.02984
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1.10285 16.45271
## sample estimates:
## mean of the differences
## 8.777778
pval = t.test(after,before, data = df,paired = TRUE)$p.value
t0 = t.test(after,before, data = df,paired = TRUE)$statistic
df = n-1Test statistic: \(T=\frac{\bar D}{S_d / \sqrt{n}}\). Under \(H_0\), test statistic tends to follow a t distribution with \(n-1=8\) degree of freedom.
Observed test statistic: \(t_0=\frac{8.78}{9.98/ \sqrt{9}}\)
P-value: \(2P(t_{8 \geq |2.6373657|})=0.0298\) or \(P(t_{8 \leq |2.6373657|})=0.0298\) or \(P(t_{8 \geq |2.6373657|})=0.0298\)
Decision:
Sign test: is used to test \(H_0:\mu = \mu_0\) and paired data when normality is not satisfied.
drawback: it ignores all the information on magnitude and hence has low power.
The sign test is a non-parametric test as no assumption on the data distribution is made except symmetry.
mu = 30
p1 = ggplot(data, aes(x= uni_work)) +
geom_histogram(bins = 12, show.legend = F) +
theme_linedraw(base_size = 18) +
labs(y = "Count", x = "Uni Work (hours)") +
geom_boxplot(aes(x = uni_work, y = 60), outlier.alpha = 0.5, width = 18) +
geom_vline(xintercept= mu,colour = "blue",linetype = "dashed") # the tested mean
p2 = ggqqplot(data, x = "uni_work") +
theme_linedraw(base_size = 18) + theme(legend.position = "none")
grid.arrange(p1, p2, ncol=2)Figure 13.1: Distribution of the data.
diff = data$uni_work-mu
n = length(sign(diff)[sign(diff) != 0])
freq = as.vector(table(sign(diff)[sign(diff) != 0]))
binom.test(freq,p=0.5,alternative="greater")##
## Exact binomial test
##
## data: freq
## number of successes = 65, number of trials = 117, p-value = 0.1336
## alternative hypothesis: true probability of success is greater than 0.5
## 95 percent confidence interval:
## 0.4753134 1.0000000
## sample estimates:
## probability of success
## 0.5555556
Hypothesis: \(H_0:\mu=30\) vs \(H_1:\mu>30, \mu<30,\mu \neq 30\)
Assumptions: Observations are independently sampled from a symmetric distribution.
Test statistic: \[T=number~of (D_i>0)\] where \(D_i=X_i-30\). Under \(H_0\), the test statistic follows a binomial distribution with identity \(B(n,\frac{1}{2})\) where n is the number of non-zero differences.
Observed test statistic: \(t_0=number~of(d_i>0)=52\)
P-value:
Conclusion:
Sign test can be used to test differences between paired data when normality is not satisfied.
Hypothesis: \(H_0:p_+=\frac{1}{2}\) vs \(H_1:p_+>\frac{1}{2}\)
Assumptions: Differences \(D_i\) are independent.
before = c(27, 25, 27, 44, 30, 67, 53, 53, 52)
after = c(27, 29, 37, 36, 46, 82, 57, 80, 61)
df = data.frame(before,after, diff = after-before)
n = length(df$diff)
freq = as.vector(table(sign(df$diff)[sign(df$diff) != 0]))
t0 = sum(df$diff>0)
binom.test(t0,n,p=0.5,alternative="greater")##
## Exact binomial test
##
## data: t0 and n
## number of successes = 7, number of trials = 9, p-value = 0.08984
## alternative hypothesis: true probability of success is greater than 0.5
## 95 percent confidence interval:
## 0.4503584 1.0000000
## sample estimates:
## probability of success
## 0.7777778
Test statistic: Let \(T\) be the number of positive differences out of the 9 non-zero differences. Under \(H_0\), the test statistic follows a binomial distribution with the identity \(B(9,\frac{1}{2})\).
Observed test statistic: We observed \(t_0=7\) positive differences in the sample.
P-value: probability of getting a test statistic as or more extreme than what we observed, \(P(T \geq 7)=1-P(T \leq 6)=1-pbinom(6,size=9,prob=\frac{1}{2})\approx 0.0898\)
Conclusion:
As p-value is smaller than 0.05, we reject the null hypothesis. The population mean of two samples are the same.
As p-value is smaller than 0.05, we reject the null hypothesis. The population mean of two samples are different.
Wilcoxon signed-rank test:
mu = 30
p1 = ggplot(data, aes(x= uni_work)) +
geom_histogram(bins = 12, show.legend = F) +
theme_linedraw(base_size = 18) +
labs(y = "Count", x = "Uni Work (hours)") +
geom_boxplot(aes(x = uni_work, y = 60), outlier.alpha = 0.5, width = 18) +
geom_vline(xintercept= mu,colour = "blue",linetype = "dashed") # the tested mean
p2 = ggqqplot(data, x = "uni_work") +
theme_linedraw(base_size = 18) + theme(legend.position = "none")
grid.arrange(p1, p2, ncol=2)Figure 14.1: Distribution of the data.
Hypothesis: \(H_0:\mu=30\) vs \(H_1:\mu>30, \mu<30,\mu \neq 30\)
Assumptions: Observations are independently sampled from a symmetric distribution.
diff = data$uni_work-mu
# for paired data
test = wilcox.test(y,x,alternative="greater",paried=TRUE)
# without continuity correction
test = wilcox.test(diff,alternative="greater",correct = FALSE)
# for one-sample
(test=wilcox.test(diff, alternative = "greater"))##
## Wilcoxon signed rank test with continuity correction
##
## data: diff
## V = 2886.5, p-value = 0.9395
## alternative hypothesis: true location is greater than 0
Test statistic:
Observed test statistic:
P-value:
Conclusion:
For large enough \(n\), we can use normal distribution to approximate the distribution of the sign rank test statistic: \(W^+\)~\(N(\frac{n(n+1)}{4},\frac{n(n+1(2n+1))}{24})\)
mu = 30
p1 = ggplot(data, aes(x= uni_work)) +
geom_histogram(bins = 12, show.legend = F) +
theme_linedraw(base_size = 18) +
labs(y = "Count", x = "Uni Work (hours)") +
geom_boxplot(aes(x = uni_work, y = 60), outlier.alpha = 0.5, width = 18) +
geom_vline(xintercept= mu,colour = "blue",linetype = "dashed") # the tested mean
p2 = ggqqplot(data, x = "uni_work") +
theme_linedraw(base_size = 18) + theme(legend.position = "none")
grid.arrange(p1, p2, ncol=2)Figure 14.2: Distribution of the data.
Hypothesis: \(H_0:\mu=30\) vs \(H_1:\mu>30, \mu<30,\mu \neq 30\)
Assumptions: Observations are independently sampled from a symmetric distribution.
diff = data$uni_work-mu
n = length(diff)
mean = n*(n+1)/4
var = n*(n+1)*(2*n+1)/24
w_calc = data.frame(
dif = diff,
absDif = abs(diff),
rankAbsDif = rank(abs(diff)),
signrank = sign(diff)*rank(abs(diff))
)
w_pos = w_calc %>%
filter(signrank>0) %>%
summarise(sum(signrank)) %>%
pull()
w_neg = w_calc %>%
filter(signrank<0) %>%
summarise(abs(sum(signrank))) %>%
pull()
# for one-sided:
w = w_pos
t0 = (w-mean)/(sqrt(var))
pval = pnorm(t0) # for mu < mu_0
pval = 1-pnorm(t0) # for mu > mu_0
# for two-sided:
w = min(c(w_pos,w_neg))
t0 = (w-mean)/(sqrt(var))
pval = 2*pnorm(t0)Test statistic: \(W=min(W^+,W^-)\) where \(W^+=\sum_{i:D_i>0}R_i\), \(W^-=\sum_{i:D_i<0}R_i,\) \(D_i=X_i-30\) and \(R_i\) are the ranks of \(|D_1|,|D_1|,...,|D_n|\). Under \(H_0\), the test statistic, with identity \(W^+\)~\(WSR(138)\), follows a symmetric distribution with mean \(E(W^+)=\frac{n(n+1)}{4}=4795.5\) and \(Var(W^+)=\frac{n(n+1)(2n+1)}{24}=2.2139225\times 10^{5}\).
Observed test statistic: the test statistic is found by determining differences between observations and 30 \(D_i=X_i-\mu_0\), followed by assigning the signed ranks of \(D_i\). The sum of positive ranks (\(w^+\)) and the sum of negative ranks (\(w^-\)) are calculated.
By using normal approximation, test statistic can be calculated by: \(t_0=\frac{w-E(w^+)}{\sqrt{Var(w^+)}}=\frac{3978.5-4795.5}{\sqrt{2.2139225\times 10^{5}}}=-1.74\)
P-value:
Conclusion:
Wilcoxon rank-sum test / Mann-Whitney U test:
# ggplot(data %>% filter(gender != "Non-binary"), aes(x= exercise)) +
# geom_histogram(aes(fill = gender), bins = 12, show.legend = F) +
# facet_grid(rows = vars(gender)) +
# theme_linedraw(base_size = 18) +
# labs(y = "Count", x = "Exercise per week (hours)") +
# geom_boxplot(aes(x = exercise, y = 45), outlier.alpha = 0, width = 18) +
# geom_jitter(aes(x = exercise, y = 45, colour=gender, alpha = 0.5), height = 8, show.legend=F)
ggplot(data %>% filter(gender != "Non-binary"),aes(x = exercise, fill = gender))+geom_density(alpha = 0.5)Figure 15.1: Comparison of number of hours per week exercised between females and males
Hypothesis: \(H_0:\mu_X=\mu_Y\) vs \(H_1:\mu_X\neq \mu_Y\)
Assumptions: Observations \(X_i,Y_i\) are independent and two samples follow the same kind of distribution but differ by a shift.
# without ties
male = data %>% filter(gender =="Male") %>% select(exercise)
male = male$exercise
female = data %>% filter(gender =="Female") %>% select(exercise)
female = female$exercise
(test = wilcox.test(male,female))##
## Wilcoxon rank sum test with continuity correction
##
## data: male and female
## W = 2753, p-value = 0.0025
## alternative hypothesis: true location shift is not equal to 0
nA=length(male)
nB=length(female)
N=nA+nB
nRatio = nA*(N+1)/2
t0 = test$statistic
pval = test$p.valueTest statistic: \(W=R_1+R_2+...+R_{n_X}\). Under \(H_0\), the test statistic follows the \(WRS(91,46)\) distribution.
Observed test statistic: \(w=r_1+r_2+...+r_{n_x}=2753\)
P-value:
Decision:
ggplot(data %>% filter(gender != "Non-binary"), aes(x= exercise)) +
geom_histogram(aes(fill = gender), bins = 12, show.legend = F) +
facet_grid(rows = vars(gender)) +
theme_linedraw(base_size = 18) +
labs(y = "Count", x = "Exercise per week (hours)") +
geom_boxplot(aes(x = exercise, y = 45), outlier.alpha = 0, width = 18) +
geom_jitter(aes(x = exercise, y = 45, colour=gender, alpha = 0.5), height = 8, show.legend=F)Hypothesis: \(H_0:\mu_X=\mu_Y\) vs \(H_1:\mu_X\neq \mu_Y\)
Assumptions: Observations \(X_i,Y_i\) are independent and two samples follow the same kind of distribution but differ by a shift.
rank = data %>% dplyr::mutate(
r = rank(exercise))
rank_sum = rank %>%
filter(gender != "Non-binary") %>%
dplyr::group_by(gender) %>%
dplyr::summarise(
w = sum(r),
xbar = mean(exercise),
s = sd(exercise),
n = n()
)
nA = rank_sum$n[rank_sum$gender=="Male"]
nB = rank_sum$n[rank_sum$gender=="Female"]
N = nA+nB
w = rank_sum$w[rank_sum$gender=="Male"]
EW = nA * (N+1)/2
sumsqrank = sum(rank$r^2)
g = N*(N+1)^2/4
var = nA*nB*(sumsqrank-g)/(N*(N-1))
t0 = (w-EW)/sqrt(var)
# for muX > muY:
pval = 1-pnorm(t0)
# for muX < muY:
pval = pnorm(t0)
# for two-sided:
pval = 2*(1-pnorm(abs(t0)))Test statistic: when there are ties, the p-value can be calculated using normal approximation to distribution of test statistic. The statistic follows a normal distribution with identity of \(N(0,1)\), and can be calculated by \(T=\frac{W-E(W)}{\sqrt{Var(W)}}\), where \(E(W)=\frac{n_x(N+1)}{2}=6279\) and \(Var(W)=\frac{n_xn_y}{N(N-1)}(\sum^N_{i=1}r^2-\frac{N(N+1)^2}{4})=5.1831631\times 10^{4}\).
Observed test statistic: \(w=r_1+r_2+...+r_{n_x}=6980.5\)
P-value As the exact \(WRS'(91,46)\) distribution with ties is unknown, we use a normal approximation to this distribution with \(E(W)=\)
Decision:
ggplot(data %>% filter(gender!="Non-binary"),
aes(y = exercise, x = gender,
colour = gender)) +
geom_boxplot(coef = 10) +
geom_jitter(width=0.1, size = 5) +
theme_bw(base_size = 36) +
theme(legend.position = "none") +
labs(y = "Exercise(h)",x = "Gender")Hypothesis: \(H_0:\mu_x=\mu_y\) vs \(H_1:\mu_x >\mu_y\)
Assumptions: The observation \(X_1,X_2,...,X_{n_x},Y_1,Y_2,...,Y_{n_y}\) are exchangeable, i.e. swapping labels on observations keeps the data just as likely as the original.
Test statistic: There is no outliers exist in the samples, the t-test statistic is used in the permutation test. The t-test test statistic is calculated by \(T=\frac{\bar{X}-\bar{Y}}{S_p \sqrt{\frac{1}{n_1}+\frac{1}{n_2}}}\), where \(S^2_p=\frac{(n_x-1)S^2_x+(n_y-1)S^2_y}{n_x+n_y-2}\). Note that although t-test statistic is used, the permutation test does not use the t distribution.
set.seed(100)
dat = data %>% filter(gender %in% c("Female","Male"))
# without outliers
ttest_t0 = t.test(exercise ~ gender, data = dat, var.equal=TRUE)$statistic
B = 10000
permuted_dat = dat
t_null = vector("numeric",B)
for(i in 1:B){
permuted_dat$gender = sample(dat$gender)
t_null[i]=t.test(exercise ~ gender, data = permuted_dat)$statistic
}
pval = mean(abs(t_null)>=abs(ttest_t0))Observed test statistic: the original test statistic is \(t_0=\frac{\bar{x}-\bar{y}}{S_p \sqrt{\frac{1}{n_1}+\frac{1}{n_2}}}\), where \(S^2_p=\frac{(n_x-1)S^2_x+(n_y-1)S^2_y}{n_x+n_y-2}=-2.74\).
Data is permuted by 10000 times and feed into t-test. The test statistics are recorded and distributed as below.
p1 = data.frame(t_null) %>%
ggplot(aes(x = t_null)) +
geom_histogram(binwidth = 0.1) +
# theme_linedraw(base_size = 36) +
labs(y = "Count",x = "Test statistic")
p2 = data.frame(abs_t_null = abs(t_null)) %>%
ggplot(aes(x = abs_t_null)) +
geom_histogram(binwidth = 0.1) +
geom_vline(xintercept = abs(ttest_t0),
col = "red", lwd = 0.5)+
labs(y = "Count",x = "Absolute test statistic")
grid.arrange(p1,p2,nrow=2)Figure 16.1: Distribution of test statistics.
P-value: the p-value, \(P(T\geq |-2.74|)=0.0072,\) is the proportion of test statistics from randomly permuted data being more extreme than the test statistic we observed.
Decision:
Hypothesis: \(H_0:\mu_x=\mu_y\) vs \(H_1:\mu_x >\mu_y\)
Assumptions: The observation \(X_1,X_2,...,X_{n_x},Y_1,Y_2,...,Y_{n_y}\) are exchangeable, i.e. swapping labels on observations keeps the data just as likely as the original.
Test statistic: there are outliers within samples, t-test testing the mean can be significantly affected by these outliers. Therefore, instead of t-test, the wilcoxon rank-sum test statistic which is robust to outliers is applied in the permutation test. The wilcoxon test statistic is calculated by \(W=R_1+R_2+...+R_{n_X}\)
set.seed(100)
dat = data %>% filter(gender %in% c("Female","Male"))
# with outliers
wilcox_t0 = wilcox.test(exercise ~ gender, data = dat)$statistic
B = 10000
permuted_dat = dat
t_null = vector("numeric",B)
for(i in 1:B){
permuted_dat$gender = sample(dat$gender)
t_null[i]=wilcox.test(exercise ~ gender, data = permuted_dat)$statistic
}
pval = mean(t_null>=abs(wilcox_t0))Observed test statistic: the original test statistic is \(w=r_1+r_2+...+r_{n_x}=1433\)
Data is permuted by 10000 times and feed into wilcoxon rank-sum test. The test statistics are recorded and distributed as below.
data.frame(t_null) %>%
ggplot(aes(x = t_null)) +
geom_histogram(binwidth = 10) +
geom_vline(xintercept = wilcox_t0,col = "red", lwd = 0.5) +
# theme_linedraw(base_size = 36) +
labs(y = "Count",x = "Test statistic")Figure 16.2: Distribution of test statistics.
P-value: the p-value, \(P(T\geq-2.74)=0.9985,\) is the proportion of test statistics from randomly permuted data being more extreme than the test statistic we observed.
Decision:
Hypothesis: \(H_0:\mu_x=\mu_y\) vs \(H_1:\mu_x >\mu_y\)
Assumptions: The observation \(X_1,X_2,...,X_{n_x},Y_1,Y_2,...,Y_{n_y}\) are exchangeable, i.e. swapping labels on observations keeps the data just as likely as the original.
Test statistic: there are outliers within samples, t-test testing the mean can be significantly affected by these outliers. Therefore, instead of t-test, MAD (median-absolute-deviation) test statistic which is robust to outliers is applied in the permutation test. The wilcoxon test statistic is calculated by \(T=\widetilde X-\widetilde Y\).
set.seed(100)
dat = data %>% filter(gender %in% c("Female","Male"))
# with outliers
mad_t0 = median(dat$exercise[dat$gender=="Male"])-median(dat$exercise[dat$gender=="Female"])
B = 10000
permuted_dat = dat
t_null = vector("numeric",B)
for(i in 1:B){
permuted_dat$gender = sample(dat$gender)
t_null[i]=median(permuted_dat$exercise[permuted_dat$gender=="Male"])-median(permuted_dat$exercise[permuted_dat$gender=="Female"])
}
pval = mean(t_null>=abs(mad_t0))Observed test statistic: the original test statistic is \(t_0=\widetilde x-\widetilde y=2\)
Data is permuted by 10000 times and feed into MAD test. The test statistics are recorded and distributed as below.
data.frame(t_null) %>%
ggplot(aes(x = t_null)) +
geom_histogram(binwidth = 0.4) +
geom_vline(xintercept = mad_t0,col = "red", lwd = 0.5) +
# theme_linedraw(base_size = 36) +
labs(y = "Count",x = "Test statistic")Figure 16.3: Distribution of test statistics.
P-value: the p-value, \(P(T\geq-2.74)=0.0365,\) is the proportion of test statistics from randomly permuted data being more extreme than the test statistic we observed.
Decision:
ggplot(data, aes(x= uni_work)) +
geom_histogram(bins = 12, show.legend = F) +
theme_linedraw(base_size = 18) +
labs(y = "Count", x = "Uni Work (hours)") +
geom_boxplot(aes(x = uni_work, y = 60), outlier.alpha = 0.5, width = 18) +
geom_vline(xintercept= 35,colour = "blue",linetype = "dashed") # the tested meanFigure 16.4: Distribution of data with blue line indicating the tested mean of …
Hypothesis: \(H_0: \mu =30\) vs \(H_1: \mu ><\neq 30\)
Assumptions: The observation \(X_1,X_2,...,X_{n_x}\) are exchangeable, i.e. swapping labels on observations keeps the data just as likely as the original.
Test statistic: the wilcoxon signed-rank test statistic is used, calculated by \(T=\sum^n_{i:d_i>0}r_i\times sign(d_i)\)
# t.test
test = t.test(data$uni_work,mu = mu)
diff = data$uni_work - mu
t0 = mean(diff)/sd(diff)*sqrt(n)
B = 10000
permuted_dat = dat
t_null = vector("numeric",B)
for(i in 1:B){
sign_permute = sample(c(-1,1),n,replace=TRUE)
d_permute = diff*sign_permute
t_null[i]=mean(d_permute)/sd(d_permute)*sqrt(n)
}
pval = mean(abs(t_null)>=abs(t0))Observed test statistic: the original test statistic is -1.2354766
p1 = data.frame(t_null) %>%
ggplot(aes(x = t_null)) +
geom_histogram(binwidth = 0.1) +
# theme_linedraw(base_size = 36) +
labs(y = "Count",x = "Test statistic")
p2 = data.frame(abs_t_null = abs(t_null)) %>%
ggplot(aes(x = abs_t_null)) +
geom_histogram(binwidth = 0.1) +
geom_vline(xintercept = abs(t0),
col = "red", lwd = 0.5)+
labs(y = "Count",x = "Absolute test statistic")
grid.arrange(p1,p2,nrow=2)Figure 16.5: Distribution of test statistics.
P-value: the p-value, \(P(T\geq|-2.74|)=0.2177,\) is the proportion of test statistics from randomly permuted data being more extreme than the test statistic we observed.
Decision:
Estimation: vs Hypothesis testing::
Estimation: a population parameter is unknown. Use the sample statistics to generate estimates of the population parameter.
Hypothesis testing: explicit statement regarding the population parameter. Test statistics are generated which will either support or reject the null hypothesis.
Confidence interval: a statement about the underlying population parameter. *(NB: confidence \(\neq\)probability)
for a 95% CI (a,b):
Bootstrapping:
Bootstrapping is useful when:
Advantages:
## [1] 28.34058
B=10000
result = vector("numeric", length=B)
for(i in 1:B){
newData = sample(data$uni_work,replace=TRUE)
result[i] = mean(newData)
}
(CI = quantile(result,c(0.025,0.975)))## 2.5% 97.5%
## 25.78986 30.97844
The data is resampled by 10000 times from the sample with replacement. Based on the bootstrapping result, our 95% confidence interval is between 25.79 and 30.98.
data.frame(result) %>%
ggplot(aes(x = result)) +
geom_histogram(binwidth = 0.5) +
geom_vline(xintercept = CI,
col = "red", lwd = 0.5) +
labs(y = "Count",x = "Mean")Figure 18.1: Histogram of boostraping results
sum = data %>% group_by(gender) %>%
summarise(Mean = mean(exercise),
Median = median(exercise),
SD = sd(exercise),
Variance = var(exercise),
n = n())
colnames(sum)=c("Gender","Mean","Median","SD","Variance","Counts")
knitr::kable(sum,digits = 1, align="cccccc",caption = "Statistics of number of hours spent on exercising by different genders.") %>% kable_paper(full_width = F, html_font = "Cambria")| Gender | Mean | Median | SD | Variance | Counts |
|---|---|---|---|---|---|
| Female | 3.4 | 3 | 2.9 | 8.4 | 46 |
| Male | 5.0 | 5 | 3.4 | 11.7 | 91 |
| Non-binary | 5.0 | 5 | NA | NA | 1 |
ggplot(data, aes(x= gender,y=exercise,fill=gender))+
geom_boxplot(aes(x = gender, y = exercise), outlier.alpha = 0.5)+labs(y = "Count", x = "Exercise")ggplot(data, aes(x= uni_work)) +
geom_histogram(bins = 12, show.legend = F) +
theme_linedraw(base_size = 18) +
labs(y = "Count", x = "Uni Work (hours)") +
geom_boxplot(aes(x = uni_work, y = 60), outlier.alpha = 0.5, width = 18) +
geom_vline(xintercept= 35,colour = "blue",linetype = "dashed") # the tested meanggplot(data %>% filter(gender != "Non-binary"), aes(x= exercise)) +
geom_histogram(aes(fill = gender), bins = 12, show.legend = F) +
facet_grid(rows = vars(gender)) +
theme_linedraw(base_size = 18) +
labs(y = "Count", x = "Exercise per week (hours)") +
geom_boxplot(aes(x = exercise, y = 45), outlier.alpha = 0, width = 18) +
geom_jitter(aes(x = exercise, y = 45, colour=gender, alpha = 0.5), height = 8, show.legend=F)Figure 19.1: Comparison of number of hours per week exercised between females and males
The upper and lower edges of the notches are at \(median \pm 1.58\times \frac{IQR}{\sqrt n}\)
Rule of thumb: if the notches of the two boxes do not overlap, this suggests that the medians are significantly different.
ggplot(data %>% filter(gender != "Non-binary"), aes(x = gender, y = exercise)) +
geom_boxplot(notch = TRUE) +
geom_dotplot(stackdir = "center",
binaxis = "y") +
theme_linedraw(base_size = 34) +
labs(y = "Heat of fusion (cal/g)",
x = "Method")# barchart
media_f = as.data.frame(table(data$social_media))
sort = arrange(media_f, Freq)
data %>% ggplot() +
aes(y = social_media,fill = gender, stat = "count") +
geom_bar() +
scale_y_discrete(name=" ", limits=as.character(sort$Var1)) +
labs(y = "", x = "Count")+theme(plot.title = element_text(hjust = 0.5))